from IPython import display
# Build an IPython Image object for "image.jpg" (path is relative to the working directory);
# presumably rendered inline because it is the final expression of a notebook cell.
display.Image("image.jpg")
### Pandas and Numpy
import pandas as pd
import numpy as np
### Visualisation libraries
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
### For Q-Q Plot
import scipy.stats as stats
### To ignore warnings
import warnings
# NOTE(review): 'ignore' with no category or module filter silences every warning raised after
# this point, including deprecation notices from the plotting and data libraries.
warnings.filterwarnings('ignore')
### Machine Learning libraries
import sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import r2_score
### To be able to see maximum columns on screen
# Raise the column display limit to 500 so wide frames are not truncated when shown.
pd.options.display.max_columns = 500
# Columns read from cardekho_dataset.csv; any other column in the file is skipped.
selected_columns = [
    'car_name', 'brand', 'model', 'vehicle_age', 'km_driven',
    'seller_type', 'fuel_type', 'transmission_type', 'mileage', 'engine',
    'max_power', 'seats', 'selling_price',
]
dataset = pd.read_csv('cardekho_dataset.csv', usecols=selected_columns)
# Preview the first five rows.
dataset.head()
| car_name | brand | model | vehicle_age | km_driven | seller_type | fuel_type | transmission_type | mileage | engine | max_power | seats | selling_price | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Maruti Alto | Maruti | Alto | 9 | 120000 | Individual | Petrol | Manual | 19.70 | 796 | 46.30 | 5 | 120000 |
| 1 | Hyundai Grand | Hyundai | Grand | 5 | 20000 | Individual | Petrol | Manual | 18.90 | 1197 | 82.00 | 5 | 550000 |
| 2 | Hyundai i20 | Hyundai | i20 | 11 | 60000 | Individual | Petrol | Manual | 17.00 | 1197 | 80.00 | 5 | 215000 |
| 3 | Maruti Alto | Maruti | Alto | 9 | 37000 | Individual | Petrol | Manual | 20.92 | 998 | 67.10 | 5 | 226000 |
| 4 | Ford Ecosport | Ford | Ecosport | 6 | 30000 | Dealer | Diesel | Manual | 22.77 | 1498 | 98.59 | 5 | 570000 |
# Print each column's non-null count and dtype, plus the frame's memory usage
dataset.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 15411 entries, 0 to 15410 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 car_name 15411 non-null object 1 brand 15411 non-null object 2 model 15411 non-null object 3 vehicle_age 15411 non-null int64 4 km_driven 15411 non-null int64 5 seller_type 15411 non-null object 6 fuel_type 15411 non-null object 7 transmission_type 15411 non-null object 8 mileage 15411 non-null float64 9 engine 15411 non-null int64 10 max_power 15411 non-null float64 11 seats 15411 non-null int64 12 selling_price 15411 non-null int64 dtypes: float64(2), int64(5), object(6) memory usage: 1.5+ MB
### Summary statistics for every column; include='all' also covers the object columns
### (count/unique/top/freq), and .T transposes the result so each feature is one row
dataset.describe(include='all').T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| car_name | 15411 | 121 | Hyundai i20 | 906 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| brand | 15411 | 32 | Maruti | 4992 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| model | 15411 | 120 | i20 | 906 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| vehicle_age | 15411.0 | NaN | NaN | NaN | 6.036338 | 3.013291 | 0.0 | 4.0 | 6.0 | 8.0 | 29.0 |
| km_driven | 15411.0 | NaN | NaN | NaN | 55616.480631 | 51618.548422 | 100.0 | 30000.0 | 50000.0 | 70000.0 | 3800000.0 |
| seller_type | 15411 | 3 | Dealer | 9539 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| fuel_type | 15411 | 5 | Petrol | 7643 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| transmission_type | 15411 | 2 | Manual | 12225 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| mileage | 15411.0 | NaN | NaN | NaN | 19.701151 | 4.171265 | 4.0 | 17.0 | 19.67 | 22.7 | 33.54 |
| engine | 15411.0 | NaN | NaN | NaN | 1486.057751 | 521.106696 | 793.0 | 1197.0 | 1248.0 | 1582.0 | 6592.0 |
| max_power | 15411.0 | NaN | NaN | NaN | 100.588254 | 42.972979 | 38.4 | 74.0 | 88.5 | 117.3 | 626.0 |
| seats | 15411.0 | NaN | NaN | NaN | 5.325482 | 0.807628 | 0.0 | 5.0 | 5.0 | 5.0 | 9.0 |
| selling_price | 15411.0 | NaN | NaN | NaN | 774971.11641 | 894128.363263 | 40000.0 | 385000.0 | 556000.0 | 825000.0 | 39500000.0 |
### Count the rows that are exact duplicates of an earlier row
dataset.duplicated().sum()
167
### dropping duplicate values
# Rebind the name to the de-duplicated frame; the surviving rows keep their original index labels.
dataset = dataset.drop_duplicates()
### Re-count duplicate rows after the drop to confirm none remain
dataset.duplicated().sum()
0
### getting count of features
# Columns with object dtype ('O') are treated as categorical; every other column as numerical.
is_object_column = dataset.dtypes == 'O'
categorical_features = list(dataset.columns[is_object_column])
numerical_features = list(dataset.columns[~is_object_column])
print("There are {} categorical features namely: {}\n".format(len(categorical_features), categorical_features))
print("There are {} numerical features namely: {}".format(len(numerical_features), numerical_features))
There are 6 categorical features namely: ['car_name', 'brand', 'model', 'seller_type', 'fuel_type', 'transmission_type'] There are 7 numerical features namely: ['vehicle_age', 'km_driven', 'mileage', 'engine', 'max_power', 'seats', 'selling_price']
### Getting count of categories in each features
# nunique() on the categorical sub-frame yields one distinct-value count per column.
distinct_counts = dataset[categorical_features].nunique()
for column_name, n_categories in distinct_counts.items():
    print("The {} feature has '{}' no. of different categories".format(column_name, n_categories))
The car_name feature has '121' no. of different categories The brand feature has '32' no. of different categories The model feature has '120' no. of different categories The seller_type feature has '3' no. of different categories The fuel_type feature has '5' no. of different categories The transmission_type feature has '2' no. of different categories
### top 10 categories in each feature percent wise
separator = "*******************************************"
for column_name in categorical_features:
    # Relative frequency of each category, most frequent first.
    shares = dataset[column_name].value_counts(normalize=True)
    print(shares.head(10) * 100)
    print(separator)
Hyundai i20 5.890842 Maruti Swift Dzire 5.739963 Maruti Swift 5.077408 Maruti Alto 5.038048 Honda City 4.919969 Maruti Wagon R 4.651010 Hyundai Grand 3.732616 Toyota Innova 3.568617 Hyundai Verna 3.201260 Hyundai i10 2.663343 Name: car_name, dtype: float64 ******************************************* Maruti 32.360273 Hyundai 19.364996 Honda 9.682498 Mahindra 6.553398 Toyota 5.175807 Ford 5.090527 Volkswagen 4.027814 Renault 3.457098 BMW 2.860142 Tata 2.761742 Name: brand, dtype: float64 ******************************************* i20 5.890842 Swift Dzire 5.739963 Swift 5.077408 Alto 5.038048 City 4.919969 Wagon R 4.651010 Grand 3.732616 Innova 3.568617 Verna 3.201260 i10 2.663343 Name: model, dtype: float64 ******************************************* Dealer 62.050643 Individual 36.814484 Trustmark Dealer 1.134873 Name: seller_type, dtype: float64 ******************************************* Petrol 49.560483 Diesel 48.163212 CNG 1.961427 LPG 0.288638 Electric 0.026240 Name: fuel_type, dtype: float64 ******************************************* Manual 79.336132 Automatic 20.663868 Name: transmission_type, dtype: float64 *******************************************
### Brands wrt car count
# Number of rows per brand, largest first, shown as a one-column table.
rows_per_brand = dataset.groupby('brand')['brand'].count().sort_values(ascending=False)
rows_per_brand.to_frame(name='Car Count')
| Car Count | |
|---|---|
| brand | |
| Maruti | 4933 |
| Hyundai | 2952 |
| Honda | 1476 |
| Mahindra | 999 |
| Toyota | 789 |
| Ford | 776 |
| Volkswagen | 614 |
| Renault | 527 |
| BMW | 436 |
| Tata | 421 |
| Skoda | 333 |
| Mercedes-Benz | 332 |
| Audi | 190 |
| Datsun | 170 |
| Jaguar | 58 |
| Land Rover | 50 |
| Jeep | 41 |
| Kia | 32 |
| Porsche | 21 |
| Volvo | 20 |
| MG | 19 |
| Mini | 17 |
| Nissan | 11 |
| Lexus | 9 |
| Isuzu | 7 |
| Bentley | 3 |
| Maserati | 2 |
| ISUZU | 2 |
| Mercedes-AMG | 1 |
| Rolls-Royce | 1 |
| Force | 1 |
| Ferrari | 1 |
### Top 10 and bottom 10 brands wrt sales
# Count the rows per brand once (descending) and reuse the result for both charts.
brand_sales = dataset['brand'].value_counts()
for chart_title, subset in (
    ("Top 10 brands wrt Sales", brand_sales.head(10)),
    ("Bottom 10 brands wrt Sales", brand_sales.tail(10)),
):
    plt.figure(figsize=(10, 6))
    sns.barplot(x=subset.index, y=subset)
    plt.ylabel('Count of Sales', fontsize=15, fontweight='bold')
    plt.xlabel("Brand", fontsize=15, fontweight='bold')
    plt.xticks(rotation=45)
    plt.title(chart_title, fontsize=15, fontweight='bold')
    plt.show()
### Top 10 Cars
# Rows per car name, largest first; keep the first ten.
rows_per_car = dataset.groupby('car_name')['car_name'].count().sort_values(ascending=False)
rows_per_car.head(10).to_frame(name='Car Count')
| Car Count | |
|---|---|
| car_name | |
| Hyundai i20 | 898 |
| Maruti Swift Dzire | 875 |
| Maruti Swift | 774 |
| Maruti Alto | 768 |
| Honda City | 750 |
| Maruti Wagon R | 709 |
| Hyundai Grand | 569 |
| Toyota Innova | 544 |
| Hyundai Verna | 488 |
| Hyundai i10 | 406 |
### Bottom 10 Cars
# Rows per car name, largest first; keep the last ten (the rarest names).
rows_per_car = dataset.groupby('car_name')['car_name'].count().sort_values(ascending=False)
rows_per_car.tail(10).to_frame(name='Car Count')
| Car Count | |
|---|---|
| car_name | |
| ISUZU MUX | 2 |
| Isuzu MUX | 1 |
| Hyundai Aura | 1 |
| Mercedes-AMG C | 1 |
| Maserati Quattroporte | 1 |
| Maserati Ghibli | 1 |
| Rolls-Royce Ghost | 1 |
| Ferrari GTC4Lusso | 1 |
| Force Gurkha | 1 |
| Tata Altroz | 1 |
### Top 10 and bottom 10 selling cars countwise
# Count the rows per car name once (descending) and reuse the result for both charts.
car_sales = dataset['car_name'].value_counts()
for chart_title, subset in (
    ("Top 10 selling cars countwise", car_sales.head(10)),
    ("Bottom 10 selling cars countwise", car_sales.tail(10)),
):
    plt.figure(figsize=(10, 6))
    sns.barplot(x=subset.index, y=subset)
    plt.ylabel('Count of Sales', fontsize=15, fontweight='bold')
    plt.xlabel("Car", fontsize=15, fontweight='bold')
    plt.xticks(rotation=45)
    plt.title(chart_title, fontsize=15, fontweight='bold')
    plt.show()
### Brands wrt Total Selling Price
# Sum of selling_price per brand, largest first.
brand_revenue = dataset.groupby('brand')['selling_price'].sum().sort_values(ascending=False)
brand_revenue.to_frame(name='Total Selling Price')
| Total Selling Price | |
|---|---|
| brand | |
| Maruti | 2402298875 |
| Hyundai | 1701766000 |
| BMW | 1171390000 |
| Toyota | 1082111000 |
| Honda | 911360000 |
| Mercedes-Benz | 823437000 |
| Mahindra | 787825000 |
| Ford | 491764000 |
| Audi | 374443000 |
| Volkswagen | 316529000 |
| Tata | 288200000 |
| Skoda | 261486000 |
| Renault | 233213000 |
| Land Rover | 192344000 |
| Jaguar | 153539000 |
| Porsche | 108385000 |
| Volvo | 74594000 |
| Jeep | 73628000 |
| Kia | 55528000 |
| Datsun | 54488000 |
| Lexus | 47365000 |
| Ferrari | 39500000 |
| Mini | 37105000 |
| MG | 33306000 |
| Bentley | 27800000 |
| Rolls-Royce | 24200000 |
| Maserati | 12200000 |
| Nissan | 10509000 |
| Isuzu | 9640000 |
| Mercedes-AMG | 5100000 |
| ISUZU | 3795000 |
| Force | 700000 |
### Top 10 and bottom 10 brands wrt total selling price
# Aggregate once: summed selling_price per brand, sorted from largest to smallest.
brand_revenue = dataset.groupby('brand')['selling_price'].sum().sort_values(ascending=False)
for chart_title, subset in (
    ("Top 10 brands wrt Total Selling price", brand_revenue.head(10)),
    ("Bottom 10 brands wrt Total Selling price", brand_revenue.tail(10)),
):
    plt.figure(figsize=(10, 6))
    sns.barplot(x=subset.index, y=subset)
    plt.ylabel('Total Sales in Rupees', fontsize=15, fontweight='bold')
    plt.xlabel("Brand", fontsize=15, fontweight='bold')
    plt.xticks(rotation=45)
    plt.title(chart_title, fontsize=15, fontweight='bold')
    plt.show()
### Top 10 Cars wrt Total Selling Price
# Sum of selling_price per car name, largest first; keep the first ten.
car_revenue = dataset.groupby('car_name')['selling_price'].sum().sort_values(ascending=False)
car_revenue.head(10).to_frame(name='Total Selling Price')
| Total Selling Price | |
|---|---|
| car_name | |
| Toyota Innova | 639831000 |
| Hyundai i20 | 487790000 |
| Honda City | 468565000 |
| Maruti Swift Dzire | 459811000 |
| Maruti Swift | 365551000 |
| Toyota Fortuner | 364188000 |
| Hyundai Creta | 343551000 |
| BMW 5 | 337804000 |
| Mahindra XUV500 | 329429000 |
| Hyundai Verna | 318210000 |
### Bottom 10 Cars wrt Total Selling Price
# Sum of selling_price per car name, largest first; keep the last ten.
# The column label previously read 'Toatl Selling Price' (typo); it now matches the top-10 table.
car_revenue = dataset.groupby('car_name')['selling_price'].sum().sort_values(ascending=False)
car_revenue.tail(10).to_frame(name='Total Selling Price')
| Toatl Selling Price | |
|---|---|
| car_name | |
| Honda CR | 4415000 |
| ISUZU MUX | 3795000 |
| Datsun redi-GO | 3212000 |
| Isuzu MUX | 2300000 |
| Maruti Dzire ZXI | 2200000 |
| Nissan X-Trail | 2135000 |
| Hyundai Aura | 900000 |
| Maruti Dzire LXI | 885000 |
| Tata Altroz | 730000 |
| Force Gurkha | 700000 |
### Top 10 and bottom 10 cars wrt total selling price
# Aggregate once: summed selling_price per car name, sorted from largest to smallest.
car_revenue = dataset.groupby('car_name')['selling_price'].sum().sort_values(ascending=False)
for chart_title, subset in (
    ("Top 10 cars wrt total Selling price", car_revenue.head(10)),
    ("Bottom 10 cars wrt Total Selling price", car_revenue.tail(10)),
):
    plt.figure(figsize=(10, 6))
    sns.barplot(x=subset.index, y=subset)
    plt.ylabel('Total Sales in Rupees', fontsize=15, fontweight='bold')
    plt.xlabel("Car", fontsize=15, fontweight='bold')
    plt.xticks(rotation=45)
    plt.title(chart_title, fontsize=15, fontweight='bold')
    plt.show()
### Brands wrt Costliest Car
# Highest selling_price seen for each brand, largest first.
brand_peak_price = dataset.groupby('brand')['selling_price'].max().sort_values(ascending=False)
brand_peak_price.to_frame(name='Costliest Car Price')
| Costliest Car Price | |
|---|---|
| brand | |
| Ferrari | 39500000 |
| Rolls-Royce | 24200000 |
| Bentley | 14500000 |
| Mercedes-Benz | 13000000 |
| Porsche | 11100000 |
| Land Rover | 9200000 |
| BMW | 8500000 |
| Volvo | 8195000 |
| Lexus | 8000000 |
| Audi | 6800000 |
| Jaguar | 6300000 |
| Maserati | 6200000 |
| Jeep | 5600000 |
| Mercedes-AMG | 5100000 |
| Mini | 3875000 |
| Toyota | 3650000 |
| Skoda | 3550000 |
| Kia | 3525000 |
| Ford | 3200000 |
| Honda | 3200000 |
| Mahindra | 2950000 |
| Hyundai | 2600000 |
| Isuzu | 2300000 |
| MG | 2075000 |
| ISUZU | 1900000 |
| Tata | 1750000 |
| Nissan | 1450000 |
| Volkswagen | 1250000 |
| Maruti | 1225000 |
| Renault | 1155000 |
| Force | 700000 |
| Datsun | 650000 |
### Top 10 and bottom 10 brands wrt costliest car
# Aggregate once: maximum selling_price per brand, sorted from largest to smallest.
brand_peak_price = dataset.groupby('brand')['selling_price'].max().sort_values(ascending=False)
for chart_title, subset in (
    ("Top 10 brands wrt Costliest Car", brand_peak_price.head(10)),
    ("Bottom 10 brands wrt Costliest Car", brand_peak_price.tail(10)),
):
    plt.figure(figsize=(10, 6))
    sns.barplot(x=subset.index, y=subset)
    plt.ylabel('Price of Costliest Car in Rupees', fontsize=15, fontweight='bold')
    plt.xlabel("Brand", fontsize=15, fontweight='bold')
    plt.xticks(rotation=45)
    plt.title(chart_title, fontsize=15, fontweight='bold')
    plt.show()
### Top 10 Costliest Car
# Highest selling_price per car name, largest first; keep the first ten.
car_peak_price = dataset.groupby('car_name')['selling_price'].max().sort_values(ascending=False)
car_peak_price.head(10).to_frame(name='Car Price')
| Car Price | |
|---|---|
| car_name | |
| Ferrari GTC4Lusso | 39500000 |
| Rolls-Royce Ghost | 24200000 |
| Bentley Continental | 14500000 |
| Mercedes-Benz S-Class | 13000000 |
| Porsche Cayenne | 11100000 |
| Land Rover Rover | 9200000 |
| BMW 7 | 8500000 |
| BMW Z4 | 8250000 |
| Volvo XC | 8195000 |
| BMW X5 | 8100000 |
### Bottom 10 Costliest Car
# Highest selling_price per car name, largest first; keep the last ten.
car_peak_price = dataset.groupby('car_name')['selling_price'].max().sort_values(ascending=False)
car_peak_price.tail(10).to_frame(name='Car Price')
| Car Price | |
|---|---|
| car_name | |
| Maruti Wagon R | 625000 |
| Maruti Celerio | 595000 |
| Maruti S-Presso | 550000 |
| Renault KWID | 550000 |
| Maruti Dzire LXI | 500000 |
| Hyundai i10 | 500000 |
| Maruti Eeco | 490000 |
| Maruti Alto | 485000 |
| Datsun redi-GO | 435000 |
| Datsun RediGO | 425000 |
### Top 10 and bottom 10 costliest cars
# Aggregate once: maximum selling_price per car name, sorted from largest to smallest.
car_peak_price = dataset.groupby('car_name')['selling_price'].max().sort_values(ascending=False)
for chart_title, subset in (
    ("Top 10 Costliest Cars", car_peak_price.head(10)),
    ("Bottom 10 Costliest Cars", car_peak_price.tail(10)),
):
    plt.figure(figsize=(10, 6))
    sns.barplot(x=subset.index, y=subset)
    plt.ylabel('Price of Car in Rupees', fontsize=15, fontweight='bold')
    plt.xlabel("Car", fontsize=15, fontweight='bold')
    plt.xticks(rotation=45)
    plt.title(chart_title, fontsize=15, fontweight='bold')
    plt.show()
### Brand wrt Average Mileage
# Mean mileage per brand, largest first.
brand_mean_mileage = dataset.groupby('brand')['mileage'].mean().sort_values(ascending=False)
brand_mean_mileage.to_frame(name='Average mileage')
| Average mileage | |
|---|---|
| brand | |
| Maruti | 22.425520 |
| Renault | 22.073586 |
| Datsun | 21.215647 |
| Lexus | 20.676667 |
| Ford | 19.970335 |
| Honda | 19.901463 |
| Maserati | 19.820000 |
| Tata | 19.780855 |
| Hyundai | 19.590650 |
| Volkswagen | 18.690977 |
| Mini | 18.287647 |
| Skoda | 17.663904 |
| BMW | 17.450046 |
| Kia | 17.323125 |
| Force | 17.000000 |
| Jeep | 16.236585 |
| Nissan | 16.151818 |
| Audi | 16.119737 |
| Jaguar | 16.085172 |
| Mahindra | 15.864314 |
| MG | 15.620526 |
| Volvo | 14.860000 |
| ISUZU | 13.800000 |
| Mercedes-Benz | 13.531054 |
| Porsche | 13.515714 |
| Toyota | 13.207414 |
| Land Rover | 13.038400 |
| Isuzu | 12.600000 |
| Mercedes-AMG | 11.900000 |
| Rolls-Royce | 10.200000 |
| Bentley | 8.033333 |
| Ferrari | 4.000000 |
### Top 10 and bottom 10 brands wrt Mileage of cars
# The second chart was previously headed "Top 10" although it plots the ten lowest brands.
# The per-brand mean mileage is also computed once here instead of four times.
brand_mean_mileage = dataset.groupby('brand')['mileage'].mean().sort_values(ascending=False)
for chart_title, subset in (
    ("Top 10 brands wrt Mileage of Car", brand_mean_mileage.head(10)),
    ("Bottom 10 brands wrt Mileage of Car", brand_mean_mileage.tail(10)),
):
    plt.figure(figsize=(10, 6))
    sns.barplot(x=subset.index, y=subset)
    plt.ylabel('Mileage of Cars in KM/Lt', fontsize=15, fontweight='bold')
    plt.xlabel("Brand", fontsize=15, fontweight='bold')
    plt.xticks(rotation=45)
    plt.title(chart_title, fontsize=15, fontweight='bold')
    plt.show()
### Top 10 car wrt Mileage
# Group by car_name so the table lists cars, as the heading says; grouping by brand
# only reproduced the head of the brand-level mileage table.
car_mean_mileage = dataset.groupby('car_name')['mileage'].mean().sort_values(ascending=False)
car_mean_mileage.head(10).to_frame(name='Average mileage')
| Average mileage | |
|---|---|
| brand | |
| Maruti | 22.425520 |
| Renault | 22.073586 |
| Datsun | 21.215647 |
| Lexus | 20.676667 |
| Ford | 19.970335 |
| Honda | 19.901463 |
| Maserati | 19.820000 |
| Tata | 19.780855 |
| Hyundai | 19.590650 |
| Volkswagen | 18.690977 |
### Bottom 10 car wrt Mileage
# Group by car_name so the table lists cars, as the heading says; grouping by brand
# only reproduced the tail of the brand-level mileage table.
car_mean_mileage = dataset.groupby('car_name')['mileage'].mean().sort_values(ascending=False)
car_mean_mileage.tail(10).to_frame(name='Average mileage')
| Average mileage | |
|---|---|
| brand | |
| ISUZU | 13.800000 |
| Mercedes-Benz | 13.531054 |
| Porsche | 13.515714 |
| Toyota | 13.207414 |
| Land Rover | 13.038400 |
| Isuzu | 12.600000 |
| Mercedes-AMG | 11.900000 |
| Rolls-Royce | 10.200000 |
| Bentley | 8.033333 |
| Ferrari | 4.000000 |
### Top 10 and bottom 10 car wrt Mileage
# Aggregate once: mean mileage per car name, sorted from largest to smallest.
car_mean_mileage = dataset.groupby('car_name')['mileage'].mean().sort_values(ascending=False)
for chart_title, subset in (
    ("Top 10 Car wrt Mileage", car_mean_mileage.head(10)),
    ("Bottom 10 Car wrt Mileage", car_mean_mileage.tail(10)),
):
    plt.figure(figsize=(10, 6))
    sns.barplot(x=subset.index, y=subset)
    plt.ylabel('Mileage of Cars in KM/Lt', fontsize=15, fontweight='bold')
    plt.xlabel("Car", fontsize=15, fontweight='bold')
    plt.xticks(rotation=45)
    plt.title(chart_title, fontsize=15, fontweight='bold')
    plt.show()
### Brands wrt Mean Max Power
# Mean of max_power per brand, largest first.
brand_mean_power = dataset.groupby('brand')['max_power'].mean().sort_values(ascending=False)
brand_mean_power.to_frame(name='Mean Max Power')
| Mean Max Power | |
|---|---|
| brand | |
| Ferrari | 601.000000 |
| Bentley | 592.666667 |
| Rolls-Royce | 563.000000 |
| Mercedes-AMG | 362.070000 |
| Porsche | 299.028571 |
| Maserati | 270.940000 |
| Lexus | 219.912222 |
| Mercedes-Benz | 210.780120 |
| Land Rover | 210.237400 |
| BMW | 207.604266 |
| Jaguar | 205.987414 |
| Volvo | 205.350000 |
| Audi | 181.430158 |
| ISUZU | 175.000000 |
| Jeep | 172.375610 |
| Mini | 151.167059 |
| MG | 150.829474 |
| Isuzu | 139.795714 |
| Toyota | 135.806793 |
| Kia | 130.025000 |
| Skoda | 126.194414 |
| Nissan | 122.119091 |
| Tata | 109.823159 |
| Mahindra | 109.177568 |
| Honda | 104.340732 |
| Ford | 95.931198 |
| Volkswagen | 92.629837 |
| Hyundai | 92.459993 |
| Force | 80.800000 |
| Renault | 75.055579 |
| Maruti | 74.530935 |
| Datsun | 62.645294 |
### Top 10 and bottom 10 Brands wrt Mean Max Power
# Aggregate once: mean max_power per brand, sorted from largest to smallest.
brand_mean_power = dataset.groupby('brand')['max_power'].mean().sort_values(ascending=False)
for chart_title, subset in (
    ("Top 10 Brand wrt Mean Max Power", brand_mean_power.head(10)),
    ("Bottom 10 Brand wrt Mean Max Power", brand_mean_power.tail(10)),
):
    plt.figure(figsize=(10, 6))
    sns.barplot(x=subset.index, y=subset)
    plt.ylabel('Mean max Power of Cars', fontsize=15, fontweight='bold')
    plt.xlabel("Brand", fontsize=15, fontweight='bold')
    plt.xticks(rotation=45)
    plt.title(chart_title, fontsize=15, fontweight='bold')
    plt.show()
### Top 10 Cars wrt Max Power
# Highest max_power per car name, largest first; keep the first ten.
car_peak_power = dataset.groupby('car_name')['max_power'].max().sort_values(ascending=False)
car_peak_power.head(10).to_frame(name='Max Power')
| Max Power | |
|---|---|
| car_name | |
| Bentley Continental | 626.00 |
| Ferrari GTC4Lusso | 601.00 |
| Rolls-Royce Ghost | 563.00 |
| Porsche Cayenne | 500.00 |
| Mercedes-Benz S-Class | 459.00 |
| BMW 6 | 450.00 |
| BMW 7 | 402.00 |
| Porsche Panamera | 394.30 |
| Mercedes-AMG C | 362.07 |
| Mercedes-Benz GL-Class | 362.00 |
### Bottom 10 Cars wrt Max Power
# Highest max_power per car name, largest first; keep the last ten.
car_peak_power = dataset.groupby('car_name')['max_power'].max().sort_values(ascending=False)
car_peak_power.tail(10).to_frame(name='Max Power')
| Max Power | |
|---|---|
| car_name | |
| Datsun GO | 76.43 |
| Maruti Eeco | 73.00 |
| Renault Triber | 72.00 |
| Hyundai Santro | 68.07 |
| Datsun RediGO | 68.00 |
| Maruti Alto | 67.10 |
| Maruti Celerio | 67.05 |
| Datsun redi-GO | 67.05 |
| Renault KWID | 67.00 |
| Maruti S-Presso | 67.00 |
### Top 10 and bottom 10 car wrt Max Power
# The second chart was previously headed "Top 10" although it plots the ten lowest cars.
# The per-car maximum power is also computed once here instead of four times.
car_peak_power = dataset.groupby('car_name')['max_power'].max().sort_values(ascending=False)
for chart_title, subset in (
    ("Top 10 Car wrt Max Power", car_peak_power.head(10)),
    ("Bottom 10 Car wrt Max Power", car_peak_power.tail(10)),
):
    plt.figure(figsize=(10, 6))
    sns.barplot(x=subset.index, y=subset)
    plt.ylabel('max Power of Cars', fontsize=15, fontweight='bold')
    plt.xlabel("Car", fontsize=15, fontweight='bold')
    plt.xticks(rotation=45)
    plt.title(chart_title, fontsize=15, fontweight='bold')
    plt.show()
### Brands wrt Mean KM Driven
# Mean km_driven per brand, largest first.
brand_mean_km = dataset.groupby('brand')['km_driven'].mean().sort_values(ascending=False)
brand_mean_km.to_frame(name='Mean KM Driven')
| Mean KM Driven | |
|---|---|
| brand | |
| Toyota | 91840.925222 |
| Mahindra | 73075.585586 |
| Isuzu | 69876.000000 |
| Land Rover | 65874.760000 |
| Skoda | 64784.492492 |
| Volkswagen | 62849.574919 |
| Volvo | 61232.300000 |
| Force | 60000.000000 |
| Mercedes-Benz | 59471.605422 |
| Audi | 58600.231579 |
| Porsche | 58356.952381 |
| Ford | 58324.350515 |
| ISUZU | 56014.500000 |
| Honda | 53525.912602 |
| Maruti | 51386.572268 |
| BMW | 51094.483945 |
| Hyundai | 50346.743225 |
| Renault | 47803.174573 |
| Tata | 47466.076010 |
| Jaguar | 44499.103448 |
| Nissan | 37853.363636 |
| Jeep | 33401.365854 |
| Datsun | 33003.576471 |
| Mini | 32210.705882 |
| Lexus | 28210.444444 |
| Bentley | 25500.000000 |
| Mercedes-AMG | 24000.000000 |
| Maserati | 12250.000000 |
| MG | 11538.684211 |
| Kia | 9255.437500 |
| Rolls-Royce | 5000.000000 |
| Ferrari | 3800.000000 |
### Top 10 and bottom 10 Brands wrt Mean Km Driven
# Aggregate once: mean km_driven per brand, sorted from largest to smallest.
brand_mean_km = dataset.groupby('brand')['km_driven'].mean().sort_values(ascending=False)
for chart_title, subset in (
    ("Top 10 Brand wrt Mean Km Driven", brand_mean_km.head(10)),
    ("Bottom 10 Brand wrt Mean Km Driven", brand_mean_km.tail(10)),
):
    plt.figure(figsize=(10, 6))
    sns.barplot(x=subset.index, y=subset)
    plt.ylabel('Mean Km Driven in KM', fontsize=15, fontweight='bold')
    plt.xlabel("Brand", fontsize=15, fontweight='bold')
    plt.xticks(rotation=45)
    plt.title(chart_title, fontsize=15, fontweight='bold')
    plt.show()
### Top 10 car wrt Km Driven
# Keep the FIRST ten rows of the descending ranking. The earlier slice [10:] skipped
# exactly those ten and returned every remaining car instead.
car_peak_km = dataset.groupby('car_name')['km_driven'].max().sort_values(ascending=False)
car_peak_km.head(10).to_frame(name='Max KM Driven')
| Max KM Driven | |
|---|---|
| car_name | |
| Ford Figo | 570000 |
| Honda Jazz | 525000 |
| Mahindra Bolero | 500000 |
| Maruti Ciaz | 480000 |
| Maruti Vitara | 480000 |
| ... | ... |
| Maserati Quattroporte | 9500 |
| Rolls-Royce Ghost | 5000 |
| Hyundai Aura | 4500 |
| Tata Altroz | 3800 |
| Ferrari GTC4Lusso | 3800 |
111 rows × 1 columns
### Bottom 10 car wrt Km Driven
# Highest km_driven per car name, largest first; keep the last ten.
car_peak_km = dataset.groupby('car_name')['km_driven'].max().sort_values(ascending=False)
car_peak_km.tail(10).to_frame(name='Max KM Driven')
| Max KM Driven | |
|---|---|
| car_name | |
| Maruti XL6 | 23000 |
| Maserati Ghibli | 15000 |
| Kia Carnival | 14000 |
| BMW X4 | 14000 |
| Maruti S-Presso | 12000 |
| Maserati Quattroporte | 9500 |
| Rolls-Royce Ghost | 5000 |
| Hyundai Aura | 4500 |
| Tata Altroz | 3800 |
| Ferrari GTC4Lusso | 3800 |
### Top 10 and bottom 10 car wrt Km Driven
# Aggregate once: maximum km_driven per car name, sorted from largest to smallest.
car_peak_km = dataset.groupby('car_name')['km_driven'].max().sort_values(ascending=False)
for chart_title, subset in (
    ("Top 10 Car wrt Km Driven", car_peak_km.head(10)),
    ("Bottom 10 Car wrt Km Driven", car_peak_km.tail(10)),
):
    plt.figure(figsize=(10, 6))
    sns.barplot(x=subset.index, y=subset)
    plt.ylabel('Km Driven of Cars', fontsize=15, fontweight='bold')
    plt.xlabel("Car", fontsize=15, fontweight='bold')
    plt.xticks(rotation=45)
    plt.title(chart_title, fontsize=15, fontweight='bold')
    plt.show()
### Km driven wrt selling price, one scatter per colouring feature
# Each entry: (column used for hue, figure size, chart title).
km_scatter_specs = (
    ('seller_type', (15, 8), "KM Driven and Seller Type vs Selling Price"),
    ('transmission_type', (15, 8), "KM Driven and Transmission_type vs Selling Price"),
    ('fuel_type', (15, 8), "KM Driven and fuel_type vs Selling Price"),
    ('seats', (15, 6), "KM Driven vs Selling Price"),
)
for hue_column, figure_size, chart_title in km_scatter_specs:
    plt.figure(figsize=figure_size)
    sns.scatterplot(x=dataset['km_driven'], y=dataset['selling_price'], hue=dataset[hue_column])
    plt.ylabel('Selling Price of Cars in Rupees', fontsize=15, fontweight='bold')
    plt.xlabel("Km Driven in KM", fontsize=15, fontweight='bold')
    # Fixed axis limits keep the scale identical across the four charts; points beyond them are not shown.
    plt.xlim(0, 800000)
    plt.ylim(0, 10000000)
    plt.title(chart_title, fontsize=15, fontweight='bold')
    plt.show()
# Scatter every remaining numerical feature against selling_price, once per categorical hue.
# km_driven is left out of the x-axis list here, as is the target itself.
hue_columns = ['seller_type', 'fuel_type', 'transmission_type']
x_columns = [name for name in numerical_features if name not in ['selling_price', 'km_driven']]
for x_column in x_columns:
    for hue_column in hue_columns:
        plt.figure(figsize=(15, 6))
        sns.scatterplot(x=dataset[x_column], y=dataset['selling_price'], hue=dataset[hue_column])
        plt.ylabel('Selling Price of Cars in Rupees', fontsize=15, fontweight='bold')
        plt.xlabel("{}".format(x_column), fontsize=15, fontweight='bold')
        plt.ylim(0, 10000000)
        plt.title("{} and {} vs Selling Price".format(x_column, hue_column), fontsize=15, fontweight='bold')
        plt.show()
### most cars sold on fuel type
# One bar per fuel type, bar height = number of rows with that fuel type.
plt.figure(figsize=(10, 6))
sns.countplot(data=dataset, x='fuel_type')
plt.xlabel("Fuel Type", fontsize=15, fontweight='bold')
plt.ylabel("Count of Cars sold", fontsize=15, fontweight='bold')
plt.xticks(rotation=45)
plt.title("Most Sold Car based on Fuel Type", fontsize=15, fontweight='bold')
plt.show()
### fuel type vs total and average selling price
price_by_fuel = dataset.groupby('fuel_type')['selling_price']
# Each entry: (aggregated series sorted largest first, y-axis label, chart title).
fuel_price_charts = (
    (price_by_fuel.sum().sort_values(ascending=False),
     'Total Selling Price in Rupees', "Fuel Type vs Total Selling Price"),
    (price_by_fuel.mean().sort_values(ascending=False),
     'Mean of Selling Price in Rupees', "Fuel Type vs Mean Selling Price"),
)
for aggregated, y_label, chart_title in fuel_price_charts:
    plt.figure(figsize=(10, 6))
    sns.barplot(x=aggregated.index, y=aggregated)
    plt.ylabel(y_label, fontsize=15, fontweight='bold')
    plt.xlabel("Fuel Type", fontsize=15, fontweight='bold')
    plt.xticks(rotation=45)
    plt.title(chart_title, fontsize=15, fontweight='bold')
    plt.show()
### Fuel Type vs Average Mileage
# barplot aggregates the raw mileage rows per fuel type itself, so no groupby is needed here.
plt.figure(figsize=(10, 6))
sns.barplot(data=dataset, x='fuel_type', y='mileage')
plt.xlabel("Fuel Type", fontsize=15, fontweight='bold')
plt.ylabel('Mean of Mileage in KM/Lt', fontsize=15, fontweight='bold')
plt.xticks(rotation=45)
plt.title("Fuel Type vs Mean Mileage", fontsize=15, fontweight='bold')
plt.show()
# Line plot of selling_price against vehicle_age; the y-axis is capped at 4,000,000.
plt.subplots(figsize=(20, 8))
sns.lineplot(data=dataset, x='vehicle_age', y='selling_price', color='green')
plt.xlabel('vehicle_age', fontsize=15, fontweight='bold')
plt.ylabel('Selling_price', fontsize=15, fontweight='bold')
plt.ylim(0, 4000000)
plt.title('Vehicle Age vs Selling Price', fontsize=15, fontweight='bold')
plt.show()
# Line plot of mileage against vehicle_age.
plt.subplots(figsize=(20, 8))
sns.lineplot(data=dataset, x='vehicle_age', y='mileage', color='green')
plt.xlabel('vehicle_age', fontsize=15, fontweight='bold')
plt.ylabel('Mileage', fontsize=15, fontweight='bold')
plt.title('Vehicle Age vs Mileage', fontsize=15, fontweight='bold')
plt.show()
### Checking distribution of numerical features
for column in numerical_features:
    plt.figure(figsize=(15, 6))
    # Left panel: 30-bin histogram with a KDE overlay.
    plt.subplot(1, 2, 1)
    sns.histplot(data=dataset, x=column, kde=True, bins=30)
    plt.title("{}'s distribution".format(column), fontweight="bold", fontsize=15)
    # Right panel: probability (Q-Q) plot against the normal distribution.
    plt.subplot(1, 2, 2)
    stats.probplot(dataset[column], dist='norm', plot=plt)
    plt.title("{}'s Q-Q Plot".format(column), fontweight="bold", fontsize=15)
    plt.show()
### Checking outliers in numerical features
# Apply the seaborn theme once, before the figure and its axes exist. Calling sns.set inside
# the loop (after the first subplot was created) re-applied it on every iteration and left the
# first panel drawn before the theme took effect.
sns.set(rc={'figure.figsize': (10, 6)})
plt.figure(figsize=(20, 25))
# 4 x 2 grid: one boxplot panel per numerical feature, filled row by row.
for position, column in enumerate(numerical_features, start=1):
    plt.subplot(4, 2, position)
    sns.boxplot(data=dataset, x=column, color='pink')
    plt.title("{}".format(column), fontweight="bold", fontsize=15)
### plotting regplot for features vs dependent feature
# Apply the seaborn theme once up front; it is loop-invariant, and calling it inside the loop
# re-applied the global settings on every iteration.
sns.set(rc={'figure.figsize': (8, 8)})
plt.figure(figsize=(20, 25))
# Every numerical feature except the target gets one panel in a 3 x 2 grid.
predictor_columns = [name for name in numerical_features if name not in ['selling_price']]
for position, column in enumerate(predictor_columns, start=1):
    plt.subplot(3, 2, position)
    sns.regplot(data=dataset, x=column, y='selling_price')
    plt.xlabel(column)
    plt.ylabel("Selling Price")
    plt.title("{} Vs Selling Price".format(column), fontweight='bold', fontsize=15)
# Pairwise correlation of the numerical features, rounded to two decimals.
corr = dataset[numerical_features].corr().round(2)
corr
| vehicle_age | km_driven | mileage | engine | max_power | seats | selling_price | |
|---|---|---|---|---|---|---|---|
| vehicle_age | 1.00 | 0.33 | -0.26 | 0.10 | 0.00 | 0.03 | -0.24 |
| km_driven | 0.33 | 1.00 | -0.10 | 0.19 | 0.04 | 0.19 | -0.08 |
| mileage | -0.26 | -0.10 | 1.00 | -0.63 | -0.53 | -0.44 | -0.31 |
| engine | 0.10 | 0.19 | -0.63 | 1.00 | 0.81 | 0.55 | 0.59 |
| max_power | 0.00 | 0.04 | -0.53 | 0.81 | 1.00 | 0.17 | 0.75 |
| seats | 0.03 | 0.19 | -0.44 | 0.55 | 0.17 | 1.00 | 0.12 |
| selling_price | -0.24 | -0.08 | -0.31 | 0.59 | 0.75 | 0.12 | 1.00 |
### Plotting heatmap for visualising the correlation between features
# Default figure size for the heatmap; the colour scale is pinned to the full [-1, 1] range.
heatmap_rc = {'figure.figsize': (12, 7)}
sns.set(rc=heatmap_rc)
sns.heatmap(corr, annot=True, cmap="YlGnBu", vmin=-1, vmax=1)
<AxesSubplot:>